A little data exploration and a VERY simple OLS model
import pandas as pd
import pandas_profiling as pp
# Load the raw training data; the path is relative to the current working directory.
df = pd.read_csv("../data/raw/train.csv")
# Bare expression: previews the first rows (rendered only when run interactively,
# e.g. as the last statement of a notebook cell).
df.head()
# Build a pandas-profiling report over the whole frame. `pr` is kept because the
# feature selection further down reads its correlation tables.
pr = pp.ProfileReport(df)
# Bare expression: presumably displays the report inline in a notebook — has no
# effect when the file is run as a plain script.
pr
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline
Very first model; let's try something really simple:
# Feature selection: rank columns by their Spearman correlation with SalePrice
# (taken from the profiling report) and keep positions 1..10 of the descending
# order. Position 0 is skipped — presumably SalePrice's correlation with itself
# (TODO confirm it always sorts first).
# NOTE(review): sorting descending keeps only the most positively correlated
# columns; strongly negative correlations are never selected.
spearman_vs_price = pr.description_set['correlations']['spearman']["SalePrice"]
feats = spearman_vs_price.sort_values(ascending=False).index[1:11]
target = "SalePrice"

# Impute missing GarageYrBlt values with the column's most frequent value.
# NOTE(review): this is the only column imputed here — assumes no other selected
# feature contains NaNs, otherwise LinearRegression.fit will raise; verify.
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].mode()[0])

# Profile just the selected features (the result is not stored).
pp.ProfileReport(df[feats])

# Hold out 30% of the rows for evaluation. The seed is fixed so the split — and
# therefore the RMSE below — is reproducible across runs; without it every
# execution scores a different random split and results cannot be compared.
X_train, X_test, y_train, y_test = train_test_split(
    df[feats], df[target], test_size=0.3, random_state=42
)

# Ordinary least squares fit on the training portion.
rgr = linear_model.LinearRegression().fit(X_train, y_train)

# Root-mean-squared error on the held-out rows (square root of the MSE). Left
# as a bare expression so it is shown as the cell result in a notebook.
mean_squared_error(y_test, rgr.predict(X_test))**0.5